* ============================================================================ *
* Author: Benjamin Rosche
* Date:   July 14, 2018
* Topic:  Discrete-time survival analysis
* ============================================================================ *

version 15      // interpret this do-file under version 15 rules
cls             // clears backscroll display buffer
clear all       // "start from scratch"
set more off    // suppresses -more- prompts

* ============================================================================ *
* Load data
* ============================================================================ *

cd "C:\Users\benja\OneDrive - Cornell University\GitHub\sas\prog\analysis\data"
use liss-recoded.dta, clear

* Import in the ICE (flong) format, then declare which variables are imputed
mi import ice
mi register imputed ///
	simpc_hh hhsize hhurban income_hh income_hh2 ///
	age age2 female eduyr eduyr2 migrantd ///
	trust voted opcosts agreeablenessscore ///
	joymean valmean burmean joydev valdev burdev

* Shift the wave counter down by 7 (so that it starts at 1) and register it
* as a regular (non-imputed) variable
mi xeq: replace wave = wave - 7
mi register regular wave

* Declare the survival structure: analysis time is wave, the failure event is
* dropout==1, and id identifies the subject
mi stset wave, failure(dropout=1) id(id) noshow

// No time effect, a linear, a quadratic, and a log-time effect were tested;
// the quadratic time effect gave the best predictive results.

mi xeq: gen T  = _t0        // linear term
mi xeq: gen T2 = _t0*_t0    // quadratic term

* ============================================================================ *
* Study2: Calibrating the model
* ============================================================================ *

* Linear predictor: variable lists used by the models below
*   sas       - SAS means and deviations
*   sas_trait - SAS means only
*   cov       - covariates (list assembled in two steps)
global sas_trait joymean valmean burmean
global sas joymean joydev valmean valdev burmean burdev
global cov female age eduyr migrantd hhtype_sel income_hh hhurban
global cov $cov simpc_hh hhsize trust voted opcosts agreeablenessscore

* DESCRIPTIVES =============================================================== *
* Fit the covariates-only model to obtain the estimation-sample marker finsamp,
* then tabulate the life table for the marked observations.
* NOTE(review): according to the -mi estimate- documentation, esample() is 0 in
* the original data (m=0), so "finsamp==1" would select rows from the imputed
* copies only -- confirm that this is the intended sample for -ltable-.
mi estimate, esample(finsamp) dots: ///
	logistic _d T T2 $cov, vce(cluster id_hh)
ltable wave dropout if finsamp==1, survival tvid(id)
* ============================================================================ *

// * Only covariates
// mi estimate, saving(miestsurv1, replace) esample(esample) dots: logistic _d T T2 $cov, vce(cluster id_hh) 
// mi estimate using miestsurv1, eform post
//
// mimrgns using miestsurv1, esample(esample) predict(pr) dydx(T T2) over(_t0) 
//
// putexcel set ../output/sa, replace sheet("Covariates")
// ereturn display, eform(hr)
// matrix a = r(table)
// putexcel A1 = "Coefficient"
// putexcel A2 = "SE"
// putexcel A3 = "t statistic"
// putexcel A4 = "p value"
// putexcel A12 = "Attention! Dispersion paramater and level 2 residual are but should not be in eform"
// putexcel B1 = matrix(a)
//
// * Only SAS 
// mi estimate, saving(miestsurv2, replace) esample(esample) dots: logistic _d T T2 $sas, vce(cluster id_hh) // discrete 
// mi estimate using miestsurv2, eform post
//
// putexcel set ../output/sa, modify sheet("SAS")
// ereturn display, eform(eform)
// matrix a = r(table)
// putexcel A1 = "Coefficient"
// putexcel A2 = "SE"
// putexcel A3 = "t statistic"
// putexcel A4 = "p value"
// putexcel A12 = "Attention! Dispersion paramater and level 2 residual are but should not be in eform"
// putexcel B1 = matrix(a)
//
// * SAS and covariates 						
// mi estimate, saving(miestsurv3, replace) esample(esample) dots: logistic _d T T2 $sas $cov, vce(cluster id_hh) // discrete 												
// mi estimate using miestsurv3, eform post
//
// putexcel set ../output/sa, modify sheet("SAS + covariates")
// ereturn display, eform(eform)
// matrix a = r(table)
// putexcel A1 = "Coefficient"
// putexcel A2 = "SE"
// putexcel A3 = "t statistic"
// putexcel A4 = "p value"
// putexcel A12 = "Attention! Dispersion paramater and level 2 residual are in eform but should not be in eform"
// putexcel B1 = matrix(a)
//
// // Standardized coefficients ...
//
// foreach cov of global cov {
// 	mi xeq: egen `cov'_sd = std(`cov')
// }
//
// foreach sas of global sas {
// 	mi xeq: egen `sas'_sd = std(`sas')
// }
//
// global sas_sd joymean_sd joydev_sd valmean_sd valdev_sd burmean_sd burdev_sd
// global cov_sd female_sd age_sd eduyr_sd migrantd_sd hhtype_sel_sd income_hh_sd hhurban_sd simpc_hh_sd hhsize_sd trust_sd voted_sd opcosts_sd agreeablenessscore_sd 
//
// mi estimate, saving(miestsurv3sd, replace) dots: logistic _d T T2 $sas_sd $cov_sd, vce(cluster id_hh) // discrete 												
// mi estimate using miestsurv3sd, eform post
//
// putexcel set ../output/sa_sd, modify sheet("SAS + covariates standardized")
// ereturn display, eform(eform)
// matrix a = r(table)
// putexcel A1 = "Coefficient"
// putexcel A2 = "SE"
// putexcel A3 = "t statistic"
// putexcel A4 = "p value"
// putexcel A12 = "Attention! Dispersion paramater and level 2 residual are in eform but should not be in eform"
// putexcel B1 = matrix(a)

* ============================================================================ *
* Study 3: Predicting panel attrition
* ============================================================================ *

* A) CREATING THE INFORMATION BASE =========================================== *

set more off

// For the survival analysis, dropout at 2014 cannot be predicted: under our
// definition of the dependent variable, whether or not someone dropped out is
// only finally revealed in 2015.

// Below, the SAS variables are built from information windows of different
// lengths. The same steps are applied to each of the three score stems.

local stems joy val bur

// 2008 //
// <stem>08: the score, set to missing on every row with wave > 1
foreach s of local stems {
	mi xeq: bysort id (wave): gen `s'08 = `s'score
	mi xeq: bysort id (wave): replace `s'08 = . if wave > 1
}

// <stem>08dev: the score minus <stem>08 taken from the person's first row
// (rows are ordered by wave within id)
foreach s of local stems {
	mi xeq: bysort id (wave): gen `s'08dev = `s'score - `s'08[1]
}

// 2008-10 //
// <stem>10: within-person mean of the score over rows with wave <= 3
foreach s of local stems {
	mi xeq: bysort id (wave): egen `s'10 = mean(`s'score) if wave <= 3
}

// blank <stem>10 on rows with wave > 3
foreach s of local stems {
	mi xeq: bysort id (wave): replace `s'10 = . if wave > 3 
}

// <stem>10dev: the score minus <stem>10 taken from the person's first row
foreach s of local stems {
	mi xeq: bysort id (wave): gen `s'10dev = `s'score - `s'10[1]
}

// 2008-13 //
// the normal sample goes from 2008-13 ...
// ... $sas ...

// For every covariate in $cov, create three copies that are blanked after a
// cut-off wave:
//   <cov>08 - missing where wave > 1 (information from 2008)
//   <cov>10 - missing where wave > 3 (information till 2010)
//   <cov>13 - missing where wave > 6 (information till 2013)
local suffixes 08 10 13
local cutoffs  1 3 6

foreach v of global cov {
	forvalues k = 1/3 {
		local sfx : word `k' of `suffixes'
		local cut : word `k' of `cutoffs'
		mi xeq: bysort id (wave): gen  `v'`sfx' = `v'
		mi xeq: bysort id (wave): replace  `v'`sfx' = . if wave > `cut'
	}
}

// Checkpoint: write the data to disk and read them back in
save sa1_temp.dta, replace

use sa1_temp.dta, clear

* B) PREDICTION ============================================================== *

set more off

* Some observations regarding the prediction models:
* - When including T and T2 in the models: They will be omitted for the model 
*   using only information from 2008. This is because, essentially, 
*   with only one time point, we cannot model a time process (it will coincide 
*   with the intercept?). But also for those models with information from multiple 
*   time points, we face the problem that, as we proceed and aggregate the prediction 
*   to an overall model (Will the obs drop out? And not Pr(T+1|Not in T)), including 
*   T and T2 decreases the predictive power. Therefore, we omit these two 
*   terms from the prediction models, i.e.
*   mi estimate, saving(miestsurv1b, replace) dots: logistic _d $cov08, vce(cluster id_hh) and NOT
*   mi estimate, saving(miestsurv1b, replace) dots: logistic _d T T2 $cov08, vce(cluster id_hh) 
* - You will see that the predicted log-hazards differ between the model 
*   using information 2008-10 and the model using 2008-13 when looking at wave 2010. 
*   Of course the input to the prediction equation must be the same, as all 
*   covariates within one wave are the same, but the estimated regression 
*   coefficients differ, and that is why the predicted values differ 

* Creating linear predictors
* SAS trait scores per information window. Note that $sas is redefined here
* and from now on holds the three mean scores only.
global sas08 joy08 val08 bur08
global sas10 joy10 val10 bur10
global sas joymean valmean burmean

* Covariates per information window (each list assembled in two steps)
global cov08 female08 age08 eduyr08 migrantd08 hhtype_sel08 income_hh08 hhurban08
global cov08 $cov08 simpc_hh08 hhsize08 trust08 voted08 opcosts08 agreeablenessscore08

global cov10 female10 age10 eduyr10 migrantd10 hhtype_sel10 income_hh10 hhurban10
global cov10 $cov10 simpc_hh10 hhsize10 trust10 voted10 opcosts10 agreeablenessscore10

global cov13 female13 age13 eduyr13 migrantd13 hhtype_sel13 income_hh13 hhurban13
global cov13 $cov13 simpc_hh13 hhsize13 trust13 voted13 opcosts13 agreeablenessscore13

* Prediction
* Every model is estimated with -mi estimate- (estimates saved to a file of the
* same name) and its linear prediction is then stored with
* -mi predict ..., xb storecompleted-.

* M1: covariates only (2013)
mi estimate, saving(m1, replace) dots: ///
	logistic _d $cov13, vce(cluster id_hh)
mi predict pp_m1 using m1, xb storecompleted

* M2: SAS (trait) only
*     window 1 = 2008, window 2 = 2008-10, window 3 = 2008-13
local window = 0
foreach rhs in "$sas08" "$sas10" "$sas" {
	local ++window
	mi estimate, saving(m2_`window', replace) dots: ///
		logistic _d `rhs', vce(cluster id_hh)
	mi predict pp_m2`window' using m2_`window', xb storecompleted
}

* M3: covariates + SAS (trait) (2008-13)
mi estimate, saving(m3, replace) dots: ///
	logistic _d $cov13 $sas, vce(cluster id_hh)
mi predict pp_m3 using m3, xb storecompleted

* M4: covariates + SAS (trait + state) (2008-13)
mi estimate, saving(m4, replace) dots: ///
	logistic _d $cov13 $sas joydev valdev burdev, vce(cluster id_hh)
mi predict pp_m4 using m4, xb storecompleted

* ============================================================================ *
* Quick save
save sa1_temp2.dta, replace
use sa1_temp2.dta, clear
* ============================================================================ *

* We want to take the total number of dropouts into account when predicting dropout
* because the overwhelming majority does not drop out and our model therefore
* predicts only small likelihoods of dropout.
* Thus, we take the 'finaldropN' number of respondents with the highest likelihood
* of dropout and expect them to die...

* 1. Observed survival ======================================================= *
* finaldrop is the within-person maximum of dropout; it is registered as a
* regular (non-imputed) variable.
mi xeq: bysort id (wave): egen finaldrop = max(dropout)
mi register regular finaldrop

// mi xeq: bysort id (wave): gen finaldrop_wave = wave*finaldrop
// mi register regular finaldrop_wave

* 2. Expected survival ======================================================= *

* A log-survival was predicted for each wave, but we only choose those at
* wave 2008, 2010, and 2013. This is because the SAS trait in 2010 is based
* on the information between 2008 and 2010, and the covariate information is
* from 2010 as well. Thus, this is a fair comparison. Afterwards the other
* waves can be deleted, as from this point on we can focus solely on the
* expected survival.

* Within each person the rows are ordered from the highest wave downwards;
* each row then takes over the prediction of the row before it (the next
* higher wave) whenever that prediction is non-missing.
foreach pp in pp_m1 pp_m21 pp_m22 pp_m23 pp_m3 pp_m4 {
	mi xeq: gsort +id -wave; by id: replace `pp' = `pp'[_n-1] if `pp'[_n-1] < . 
}

* Back to ascending wave order within person
mi xeq: gsort +id +wave;

* ============================================================================ *
* Quick save
save sa1_temp3.dta, replace
use sa1_temp3.dta, clear
* ============================================================================ *

* Determine whether respondent participated in 2013-15

* p2013/p2014/p2015 equal 1 on every row of a respondent who has a row with
* wave 6/7/8 respectively, and 0 otherwise.
* The within-person maximum is taken directly over the logical expression with
* the official -egen- function max(). The previous two-step version
* (-gen- followed by -regen ..., replace-) relied on -regen-, which is not an
* official Stata command, and -egen- itself has no replace option.
mi xeq: bysort id (wave): egen p2013 = max(wave == 6)
mi xeq: bysort id (wave): egen p2014 = max(wave == 7)
mi xeq: bysort id (wave): egen p2015 = max(wave == 8)

* We focus on the subsample that participated in 2013 and then either survived or 
* dropped out in 2014 or 2015.

* event: present in 2013 and absent in 2015. This is logically identical to the
* earlier, longer expression
*   (p2013 & !p2014 & !p2015) | (p2013 & !p2015)
* because its first term is a special case of its second term.
gen event = (p2013==1) & (p2015==0)

* survivors: never recorded as dropout, present in 2013, and present in 2014
* and/or 2015.
gen survivors = (finaldrop==0) & (p2013==1) & ((p2014==1) | (p2015==1))

* To calculate the total number of dropouts, we first
* delete all waves but the first one

mi xeq: bysort id (wave): keep if _n == 1

* Keep only those that need to be predicted
* (the variable name is written out in full so that the command does not
* depend on variable abbreviation being switched on)
keep if event==1 | survivors == 1

* and then sum up the events, the survivors, and the number of rows

mi xeq: egen finaldropN = total(event) 
mi xeq: egen finalsurviveN = total(survivors) 
mi xeq: gen N = _N 

* the final number of dropouts is the same across all imputations as it was not
* imputed (for it is a dependent variable)
* Now, we are ready to create the ranking and predict dropout in terms of
* a 0 or a 1:

* For every prediction variable:
*   1. order the rows from the highest to the lowest prediction and number
*      them (rank). id is used as a second sort key so that rows with equal
*      predictions always receive the same ranks; without it, tied rows are
*      ordered randomly and the 0/1 classification at the cut-off could change
*      from run to run. (At this point there is one row per id.)
*      In a descending -gsort-, missing values are placed last unless the
*      mfirst option is given, so non-missing predictions get the top ranks.
*   2. recode every non-missing prediction to 0,
*   3. set it to 1 for the finaldropN top-ranked rows,
*   4. sort by rank and drop the helper variable again.
foreach pp in pp_m1 pp_m21 pp_m22 pp_m23 pp_m3 pp_m4 {
	mi xeq: gsort -`pp' +id; gen rank = _n
	mi xeq: replace `pp' = 0 if `pp' != .
	mi xeq: replace `pp' = 1 if rank <= finaldropN & `pp' != .
	sort rank 
	drop rank
}

* ============================================================================ *
* Save 
* ============================================================================ *

/*
save sa1_final, replace
use sa1_final, clear
*/

* correct_<model> is 1 where the 0/1 prediction equals finaldrop and 0
* otherwise (including where the prediction is missing).
* A stricter variant that additionally required finaldrop == 1 was tried and
* is not used:  ... & (finaldrop == 1)
* NOTE(review): the number of predicted dropouts (finaldropN) is the total of
* event, while correctness is judged against finaldrop -- confirm that these
* two indicators are meant to be used together.
foreach m in m1 m21 m22 m23 m3 m4 {
	mi xeq: gen correct_`m' = (pp_`m' == finaldrop)
}

* Share of correct classifications per model
sum correct*

* eof
